/*******************************************************************************

Ryan Hill: ryan.hill@kellogg.northwestern.edu
Carolyn Stein: carolyn_stein@berkeley.edu
Last modified: December 2024

Inputs:		pdb_full_entities.dta
			pdb_full_structures.dta
			pdb_full_papers.dta
			pdb_analysis.dta

Outputs: 	phat.dta

Purpose: 	Construct a measure of potential (P in the model)


*******************************************************************************/

clear all
set maxvar 100000

* Collapse some entity variables down to structure level
use "${data_built}pdb_full_entities.dta", clear
keep structureId entityId taxonomy geneName before_pdb
	
	* Clean up gene name, keep only the mode within structure 
	split geneName, parse(#)
	replace geneName1 = lower(geneName1)
	drop geneName
	rename geneName1 geneName
	bys structureId: egen modeGeneName = mode(geneName), minmode
	
	* Check the number of multi-gene structures
	preserve
		gen diff = geneName != modeGeneName & geneName != ""
		bys structureId: egen has_diff = max(diff)
		keep structureId has_diff
		duplicates drop
		tab has_diff
	restore
	
	* Clean up taxonomy, keep only the mode within structure
	split taxonomy, parse(#)
	replace taxonomy1 = lower(taxonomy1)
	drop taxonomy
	rename taxonomy1 taxonomy
	bys structureId: egen modeTaxonomy = mode(taxonomy), minmode
	
	* Check the number of multi-taxonomy structures
	preserve
		gen diff = taxonomy != modeTaxonomy & taxonomy != ""
		bys structureId: egen has_diff = max(diff)
		keep structureId has_diff
		duplicates drop
		tab has_diff 
	restore
	
	* Collapse to structure level
	collapse (first) modeGeneName modeTaxonomy (sum) before_pdb, by(structureId)
	rename modeGeneName geneName
	rename modeTaxonomy taxonomy

* Merge in structure and paper level variables
merge 1:1 structureId using "${data_built}pdb_full_structures.dta"
assert _merge == 3
drop _merge
merge m:1 pubmedId using "${data_built}pdb_full_papers.dta"
assert _merge != 2
drop _merge

* Merge and only keep analysis sample
merge 1:1 structureId using "${data_built}pdb_analysis.dta"
assert _merge != 2
keep if _merge == 3
drop _merge
	
* Take the ptiles of citations
xtile ptileCites3 = stk_cites_nslf3, nq(100)
xtile ptileCites5 = stk_cites_nslf5, nq(100)
xtile ptileCites10 = stk_cites_nslf10, nq(100)
	
* Generate square of before PDB papers
gen before_pdb2 = before_pdb^2
	
* Clean up variables so that they can be used in a regression

	* Gene name
	replace geneName = "missing" if geneName == ""
	bys geneName: egen count = count(structureId)
	replace geneName = "other" if count <= 1
	egen groupGene = group(geneName), label
	drop count
	
	* Classification
	replace classification = lower(classification)
	replace classification = "missing" if classification == ""
	
	* Some classification cleanup using Levenshtein edit distance
	strgroup classification, gen(classificationClean) threshold(0.2) first force
	bys classificationClean: egen modeClassification = 						///
		mode(classification), minmode
	drop classification classificationClean
	rename modeClassification classification

	* Consolidate unknown category
	replace classification = "unknown function" if 							///
	classification == "structural genomics, unknown function"
	
	* Create an "other" category for singletons
	bys classification: egen count = count(structureId)
	replace classification = "other" if count <= 1
	egen groupClassification = group(classification), label
	drop count
	
	* Macro molecule type
	replace macroMoleculeType = lower(macroMoleculeType)
	replace macroMoleculeType = "missing" if macroMoleculeType == ""
	egen groupMacroMoleculeType = group(macroMoleculeType), label
	
	* Taxonomy
	replace taxonomy = "missing" if taxonomy == ""
	bys taxonomy: egen count = count(structureId)
	replace taxonomy = "other" if count <= 1
	egen groupTaxonomy = group(taxonomy), label
	drop count
	
	* Publication year (fill in if missing)
	replace publicationYear = year(releaseDate) if publicationYear == .
	
* Create a Stata log of the lasso estimation for reference
cap log close
log using "${data_built}lasso_output.log", replace
 
* Lasso estimation
rlasso ptileCites3 i.groupClassification i.groupMacroMolecule 				///
i.groupTaxonomy i.groupGene before_pdb before_pdb2 i.publicationYear,     	///
prestd center ols robust
 
* Prediction (truncate predictions at 0 / 100)
predict predPtileCites3, ols 
replace predPtileCites3 = 100 if predPtileCites3 > 100
replace predPtileCites3 = 0 if predPtileCites3 < 0
	
* Post-OLS regression and output
reg ptileCites3 `e(selected)' i.publicationYear, robust
return list
display e(r2)
preserve
	clear
	regsave
	export excel "${tables}/Tables.xlsx", sheet(tableE2) cell(D80) sheetmodify
restore 
	
log close

* Save dataset -- used in bootstrapping procedure
save "${data_built}bootstrap/phat_predictors.dta", replace
	
keep structureId ptileCites3 predPtileCites3
save "${data_built}phat.dta", replace
	

	 

	

 
	


	
